第3章 メカニズムをとらえるデータ可視化

図の右上のshowボタンを押すとRのコードが表示されます。

3.1.1

library(conflicted)
library(tidyverse)
library(patchwork)

my_plot <- function(data, title){
  data |>
    ggplot(aes(x = val, fill = name)) +
    geom_histogram(position = "identity",alpha=0.5, color = "black") +
    labs(title = title) +
    theme(
      legend.position="none", 
      axis.title.x=element_blank(), 
      axis.title.y=element_blank()
    )
}

# 中央傾向:平均値が0と10の二つの分布
# 平均0、標準偏差1の正規分布に従う乱数を1000個生成
# 平均5、標準偏差1の正規分布に従う乱数を1000個生成
set.seed(0)
p1 <- bind_rows(
  data.frame(val = rnorm(1000, mean = 0, sd = 1), name = "a"),
  data.frame(val = rnorm(1000, mean = 5, sd = 1), name = "b")
  ) |>
  my_plot("ピークの位置")

# 分散:分散が大きい分布と小さい分布
# 平均5、標準偏差1の正規分布に従う乱数を1000個生成
# 平均5、標準偏差3の正規分布に従う乱数を1000個生成
set.seed(0)
p2 <- bind_rows(
  data.frame(val = rnorm(1000, mean = 5, sd = 1), name = "a"),
  data.frame(val = rnorm(1000, mean = 5, sd = 3), name = "b")
  ) |>
  my_plot("広がりの度合い")

# 形状:正規分布、左寄せの分布
# 平均5、標準偏差1の正規分布に従う乱数を1000個生成
# 形状パラメータ2、尺度パラメータ2のガンマ分布に従う乱数を1000個生成
set.seed(0)
p3 <- bind_rows(
  data.frame(val = rnorm(1000, mean = 5, sd = 1) , name = "a"),
  data.frame(val = rgamma(1000, shape = 2, scale = 2) , name = "b")
) |>
  my_plot("分布の形状")

# ピーク:ピーク一つとピーク二つの分布
# 平均5、標準偏差1の正規分布に従う乱数を1000個生成
# 平均3、標準偏差1の正規分布と平均7、標準偏差1の正規分布に従う乱数を500個ずつ生成
set.seed(0)
p4 <- bind_rows(
  data.frame(val = rnorm(1000, mean = 5, sd = 1) , name = "a"),
  data.frame(val = c(rnorm(500, mean = 3, sd = 1), rnorm(500, mean = 7, sd = 1)), name = "b")
) |>
  my_plot("ピークの数と位置")

# 外れ値:ごく少量の大きい外れ値が含まれている分布と含まれていない分布
# 平均5、標準偏差1の正規分布に従う乱数を995個と平均50、標準偏差5の正規分布に従う乱数を5個生成
# 平均5、標準偏差1の正規分布に従う乱数を1000個生成
set.seed(0)
p5 <- bind_rows(
  data.frame(val = c(rnorm(995, mean = 5, sd = 1), rnorm(5, mean = 50, sd = 5)), name = "a"),
  data.frame(val = rnorm(1000, mean = 5, sd = 1), name = "b")
) |>
  my_plot("外れ値")

# 外れ値:外れ値が多少ある分布とほぼない分布
# 平均5、標準偏差1の正規分布に従う乱数を1000個生成
# 平均5、標準偏差1の正規分布に従う乱数を975個と平均15、標準偏差1の正規分布に従う乱数を25個生成
set.seed(0)
p6 <- bind_rows(
  data.frame(val = rnorm(1000, mean = 5, sd = 1) , name = "a"),
  data.frame(val = c(rnorm(975, mean = 5, sd = 1), rnorm(25, mean = 15, sd = 1)), name = "b")
) |>
  my_plot("外れ値?")

(p1 + p2 + p3) / (p4 + p5 + p6)

3.1.2
library(conflicted)
library(tidyverse)
library(patchwork)

set.seed(0)
data_df <- data.frame(data = rnorm(100, 0, 1)) # 観測結果をデータフレームに変換

my_plot2 <- function(data, title, binwidth = 0.4, boundary = 0){
  data |>
    ggplot(aes(x=data)) +
    geom_histogram(
      aes(y = after_stat(density)), fill = "blue",
      binwidth = binwidth, boundary = boundary
      ) +
    stat_function(fun=dnorm, color="red", linewidth = 1, linetype=2, args=list(mean=0)) + 
    labs(title = title) +
    theme(
      legend.position="none", 
      axis.title.x=element_blank(), 
      axis.title.y=element_blank(),
      aspect.ratio = 1
    )
}

p1 <- data_df |>
  my_plot2("ビン幅 = 0.8", binwidth =0.8)

p2 <- data_df |>
  my_plot2("ビン幅 = 0.4", binwidth =0.4)

p3 <- data_df |>
  my_plot2("ビン幅 = 0.08", binwidth =0.08)

p4 <- data_df |>
  my_plot2("boundary = 0", boundary = 0)

p5 <- data_df |>
  my_plot2("boundary = 0.1", boundary = 0.1)

p6 <- data_df |>
  my_plot2("boundary = 0.2", binwidth =0.2)

(p1 + p2 + p3) / (p4 + p5 + p6)

3.1.3

元は1000回となっているがキレイな分布にならないので10万回に増やしてます。

library(conflicted)
library(tidyverse)
library(mclust)
library(MASS)
library(patchwork)

# サイコロを100回振った時の出目の総和を10万回計算
roll_dice <- function(n = 100, repeat_num = 100000) {
  rowSums(
    matrix(sample.int(6, size = n*repeat_num, replace = TRUE), ncol = n)
    )
}

# ヒストグラムの作成
set.seed(0)
hist_data <- roll_dice()

p1 <- data.frame(x = hist_data) |>
  ggplot(aes(x=x)) +
  geom_histogram(aes(y=after_stat(density)), binwidth=1) +
  stat_function(fun=dnorm, args=list(mean=mean(hist_data), sd=sd(hist_data)),
                color='red', linetype="dashed") +
  labs(title = "データを正規分布でフィッティング",
       x="サイコロを100回振って出た目の総和", y="相対頻度") +
  theme(aspect.ratio = 4/5)

# ピーク一つとピーク二つの分布
#平均3, 標準偏差1 と 平均7, 標準偏差1 の正規分布に従う乱数を500個ずつ生成
set.seed(0)
data4_b <- c(
  rnorm(500, mean = 3, sd = 1), 
  rnorm(500, mean = 7, sd = 1)
  )

# 混合ガウシアンモデルのパラメータを推定
gmm <- Mclust(data4_b, G=2)

# データと混合ガウシアンモデルの結果をプロット
p2 <- data.frame(x = data4_b) |>
  ggplot(aes(x=x)) +
  geom_histogram(aes(y=after_stat(density)), binwidth=0.2, colour="black", fill="greenyellow") +
  stat_function(
    fun = function(x) gmm$parameters$pro[1] * dnorm(x, gmm$parameters$mean[1], sqrt(gmm$parameters$variance$sigma)),
    linetype = "dashed",
    color = "red"
    ) +
  stat_function(
    fun = function(x) gmm$parameters$pro[2]*dnorm(x, gmm$parameters$mean[2], sqrt(gmm$parameters$variance$sigma)),
    linetype = "dashed",
    color = "blue"
    ) +
  labs(
    titel = "データから2つの正規分布を推定",
    x = expression(paste("観測値 ", italic("X")))
    ) +
  theme(
    aspect.ratio = 4/5,
    axis.title.y=element_blank()
    )

# サンプルを生成
set.seed(0)
lognorm_samples <- rlnorm(10000, meanlog = 0.954, sdlog = 0.65)
log_lognorm_samples <- log(lognorm_samples) # 対数変換

# 平均と標準偏差を推定
lognorm_fit <- fitdistr(log_lognorm_samples, densfun = "normal")

p3 <- data.frame(x = lognorm_samples) |>
  ggplot(aes(x = x)) +
  geom_histogram(aes(y = after_stat(density))) +
  labs(x = "観測値 X", y = "相対頻度") +
  theme(aspect.ratio = 4/5)
  
p4 <- data.frame(x = log_lognorm_samples) |>
  ggplot(aes(x = x)) +
  geom_histogram(aes(y = after_stat(density))) +
  labs(x = "log X", y = "相対頻度") +
  stat_function(fun=dnorm, color="red", size = 1, linetype=2,
    args=list(mean=lognorm_fit$estimate[1], sd=lognorm_fit$estimate[2])
    )  +
  theme(aspect.ratio = 4/5)

(p1 + p2) / (p3 + p4)

3.1.4

library(conflicted)
library(tidyverse)
library(patchwork)

# 描画用一時関数
my_plot_line <- function(df, color, title) {
    ggplot(df, aes(x = x, y = y)) +
    geom_line(color = color, linewidth = 2) +
    labs(title = title) +
    theme(
      axis.title.x=element_blank(),
      axis.title.y=element_blank()
    )
}

my_plot_point <- function(df, color, title) {
  ggplot(df, aes(x = x, y = y)) +
    geom_point(color = color) +
    labs(title = title) +
    theme(
      axis.title.x=element_blank(),
      axis.title.y=element_blank()
    )
}

# 一様分布
uni_p <- tibble(
  x = seq(-1, 1, length.out = 1000),
  y = dunif(x, min = -1, max = 1)
  ) |>
  my_plot_line("orange", "一様分布") 

# 正規分布
norm_p <- tibble(
  x = seq(-4, 4, length.out = 1000),
  y = dnorm(x)
)  |>
  my_plot_line("red", "正規分布") 

# 二項分布
bin_p <- tibble(
  x = 0:10,
  y = dbinom(0:10, size=10, prob=0.5)
  ) |>
  my_plot_point("blue", "二項分布") 

# ポアソン分布
poi_p <- tibble(
  x = 0:9, 
  y = dpois(0:9, lambda = 3)
) |>
  my_plot_point("forestgreen", "ポアソン分布") 

# 指数分布
exp_p <- tibble(
  x = seq(0, 10, length.out = 1000),
  y = dexp(x)
)  |>
  my_plot_point("purple", "指数分布") 

# ガンマ分布
gam_p <- tibble(
  x = seq(0, 10, length.out = 1000),
  y = dgamma(x, 5)
) |>
  my_plot_point("brown", "ガンマ分布") 

# ワイブル分布
wei_p <- tibble(
  x = seq(0.01, 2, length.out = 1000),
  y = dweibull(x, 1.5)
)  |>
  my_plot_point("grey", "ワイブル分布") 
  
# 対数正規分布
ln_p <- tibble(
  x = seq(0.01, 10, length.out = 1000),
  y = dlnorm(x)
)  |>
  my_plot_line("pink", "対数正規分布") 

# パレート分布
paretofunc <- function(x, alpha = 5, xm = 1) alpha * (xm^alpha) / (x^(alpha+1))
pare_p <- tibble(
  x = seq(1, 4, length.out = 1000),
  y = paretofunc(x)
)  |>
  my_plot_line("cyan", "パレート分布") 

# グラフを並べて表示
uni_p + norm_p + bin_p + poi_p + exp_p + gam_p + wei_p + ln_p + pare_p +
  plot_layout(ncol = 2)

3.1.5
library(conflicted)
library(tidyverse)

# 正規分布から100点と500点のサンプルを生成
set.seed(0)
samples <- rnorm(600)

p1 <- tibble(
  x = samples[1:100],
  fill = if_else(x >= quantile(x, 0.8), "80%点以上", "80%点未満")
  ) |>
  ggplot(aes(x = x, fill = fill)) +
  geom_histogram() +
  labs(x = "観測値", y = "頻度")
 
p2 <- tibble(x = samples[1:100]) |>
  ggplot(aes(x = x)) +
  stat_ecdf(pad = FALSE, color = "blue") +
  stat_function(fun=pnorm, color = "red", linetype = 2) +
  labs(title = "サンプルサイズ=100", x = "観測値", y = "累積相対頻度")

p3 <- tibble(x = samples[101:500]) |>
  ggplot(aes(x = x)) +
  stat_ecdf(pad = FALSE, color = "blue") +
  stat_function(fun=pnorm, color = "red", linetype = 2) +
  labs(title = "サンプルサイズ=500", x = "観測値", y = "累積相対頻度")

(p1 + plot_spacer()) / (p2 + p3)

3.2.1

library(conflicted)
library(tidyverse)
library(patchwork)

df <- read_csv(
  "https://www.esri.cao.go.jp/jp/sna/data/data_list/sokuhou/files/2023/qe233_2/tables/gaku-jfy2332.csv",
  col_names = FALSE, skip = 7, col_types = "cnn__nn_nn_______________________"
  ) |>
  drop_na() |>
  mutate( # 各金額を兆円単位に変換
    年度 = str_remove(X1, "/4-3.") |> as.integer(),
    `国内総生産(支出側)` = X2 / 1000,
    民間最終消費支出 = X3 / 1000,
    民間住宅 = X6 / 1000,
    民間企業設備 = X7 / 1000,
    政府最終消費支出 = X9 / 1000,
    公的固定資本形成 = X10 / 1000,
    .keep = "unused"
    )

p1 <- df |>
  ggplot(aes(x = 年度, y = `国内総生産(支出側)`)) +
  geom_line(color="skyblue") +
  labs(title = "折れ線グラフ", y = "国内総生産(支出側) [兆円]") +
  theme(axis.title.x=element_blank())

p2 <- df |>
  ggplot(aes(x = 年度, y = `国内総生産(支出側)`)) +
  geom_area(fill="skyblue") +
  labs(title = "エリアチャート", x = "年度") +
  theme(axis.title.y=element_blank())

p3 <- df |>
  dplyr::select(!`国内総生産(支出側)`) |>
  pivot_longer(!年度, names_to = "sector", values_to = "val") |>
  mutate(
    sector = factor(sector, levels = rev(c("民間最終消費支出", "民間住宅", "民間企業設備", "政府最終消費支出", "公的固定資本形成")))
    ) |>
  ggplot(aes(x = 年度, y = val, fill = sector)) +
  geom_area() +
  labs(title = "積み上げエリアチャート", x = "年度", y = "国内総生産(支出側) [兆円]", fill = "") +
  theme(
    axis.title.x=element_blank(),
    axis.title.y=element_blank()
    )

p1 + p2 + p3

3.2.2
library(conflicted)
library(tidyverse)

n_days <- 30  # 30日間
n_individuals <- 30  # 個体数
half_n <- as.integer(n_individuals / 2)  # 個体数の半分

# 各個体の活動量期待値を生成
set.seed(0)
expected_values <- c(
  rnorm(half_n, mean = 50, sd = 5),# 期待値50、標準偏差5の正規分布に従う乱数を生成
  rnorm(half_n, mean = 70, sd = 5)# 期待値70、標準偏差5の正規分布に従う乱数を生成
  )

# 各個体の30日間の活動量を生成
# 期待値expected_values、標準偏差10の正規分布に従う乱数を生成
set.seed(0)
activity_data <- purrr::map(expected_values, \(x) rnorm(n_days, mean = x, sd = 10)) |>
  list_c() |>
  matrix(ncol = 30) |>
  as.data.frame() |>
  set_names(paste0("v", formatC(1:30, width=2, flag="0"))) |>
  mutate(days = 1:30) |>
  pivot_longer(!days) |>
  group_by(days) |>
  mutate(
    ev = expected_values,
    group = c(rep("a", 15), rep("b", 15))
    ) |>
  ungroup()

# 全個体をそれぞれ違う色で描画
p1 <- activity_data |>
  ggplot(aes(x = days, y = value, color = name)) +
  geom_line() +
  labs(title = "すべての個体を異なる色で描画", x = "経過日数", y = "活動量スコア") +
  theme(
    legend.position="none"
    , axis.title.x=element_blank()
    , axis.title.y=element_blank()
  )

# 期待値最大の個体を強調
p2 <- activity_data |>
  mutate(color = if_else(ev == max(expected_values), "a", "b")) |>
  ggplot(aes(x = days, y = value, color = color, group=name)) +
  geom_line() +
  scale_color_manual(values = c("red", "grey")) +
  labs(title = "着目する個体だけ異なる色で描画", x = "経過日数", y = "活動量スコア") +
  theme(
    legend.position="none"
    , axis.title.x=element_blank()
    , axis.title.y=element_blank()
  )

# 15個体のグループごとに描画
p3 <- activity_data |>
  ggplot(aes(x = days, y = value, color = group, group=name)) +
  geom_line() +
  labs(title = "グループごとに異なる色で描画", x = "経過日数", y = "活動量スコア") +
  theme(
    legend.position="none"
    , axis.title.x=element_blank()
    , axis.title.y=element_blank()
  )

# 15個体のグループごとに平均と標準偏差で描画
p4 <- activity_data |>
  summarise(
    mean = mean(value),
    sd = sd(value),
    .by = c(group, days)
  ) |>
  ggplot(aes(x = days, y = mean, color = group, group=group)) +
  geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, fill = group), alpha=0.2) +
  geom_line() +
  labs(title = "グループごとに平均と標準偏差で描画", x = "経過日数", y = "活動量スコア") +
  theme(
    legend.position="none"
    , axis.title.x=element_blank()
    , axis.title.y=element_blank()
  ) +
  coord_cartesian(ylim = c(20,100))

(p1 + p2 )/(p3 + p4)

3.2.3
library(conflicted)
library(tidyverse)

# ランダムな体重データを生成
set.seed(0)
weights_1 <- rnorm(50, 60, 2) # 平均60、分散4の正規分布
weights_2 <- weights_1 + rnorm(50, -1, 0.5) # 平均-1、分散0.25の正規分布
weights_3 <- weights_1 + rnorm(50, 0, 0.5) # 平均0、分散0.25の正規分布(追加)

# データフレームを作成
df <- data.frame(
  Weight = c(weights_1, weights_2),
  Group = factor(
    c(rep("実験開始時", 50), rep("一か月後",50)),
    levels=c("実験開始時", "一か月後")
    ),
  Person = c(seq(50), seq(50))
  )
  
# 点のみプロット
p1 <- df |>
  ggplot(aes(x = Group, y = Weight)) +
  geom_jitter(width = 0.05, height = 0, alpha = 0.5) +
  labs(x = "期間", y = "体重 [kg]") +
  coord_cartesian(ylim = c(55, 65)) +
  labs(title = "ストリッププロット") +
  theme(axis.title.x=element_blank()) 

df2 <- df |>
  pivot_wider(names_from = Group, values_from = Weight) |>
  mutate(diff = 一か月後 - 実験開始時) |>
  pivot_longer(!c(Person, diff), names_to = "Group", values_to = "Weight") |>
  mutate(Group = factor(Group, levels=c("実験開始時", "一か月後"))) 

# スロープグラフ
p2 <- df2 |>
  ggplot(aes(x = Group, y = Weight, group = Person, color = diff)) +
  geom_line(alpha = 0.5) +
  geom_point(alpha = 0.5) +
  scale_color_gradient2(
    midpoint = 0,
    limit = c(-2,2),
    low = "blue",
    mid = "white",
    high = "red",
    guide = "colorbar"
  ) +
  coord_cartesian(ylim = c(55, 65)) +
  labs(title = "スロープグラフ") +
  theme(
    axis.title.x=element_blank()
    , axis.title.y=element_blank()
  ) 

# スロープグラフ2
p3 <- data.frame(
  Weight = c(weights_1, weights_3),
  Group = factor(
    c(rep("実験開始時", 50), rep("一か月後",50)),
    levels=c("実験開始時", "一か月後")
  ),
  Person = c(seq(50), seq(50))
) |>
  pivot_wider(names_from = Group, values_from = Weight) |>
  mutate(diff = 一か月後 - 実験開始時) |>
  pivot_longer(!c(Person, diff), names_to = "Group", values_to = "Weight") |>
  mutate(Group = factor(Group, levels=c("実験開始時", "一か月後"))) |>
  ggplot(aes(x = Group, y = Weight, group = Person, color = diff)) +
  geom_line(alpha = 0.5) +
  geom_point(alpha = 0.5) +
  scale_color_gradient2(
    midpoint = 0,
    limit = c(-2,2),
    low = "blue",
    mid = "white",
    high = "red",
    guide = "colorbar"
    ) +
  coord_cartesian(ylim = c(55, 65)) +
  labs(title = "スロープグラフ") +
  theme(
    legend.position="none"
    , axis.title.x=element_blank()
    , axis.title.y=element_blank()
  ) 

p1 + p3 + p2

3.2.4
library(conflicted)
library(tidyverse)
library(forecast)
library(patchwork)

# データの読み込み
df <- read_csv('https://raw.githubusercontent.com/selva86/datasets/master/a10.csv')

# 時系列データに変換
ts_df <- ts(
  df$value,
  start=c(year(df$date[1]), month(df$date[1])),
  frequency=12
  )

# 時系列データを分解
decomposed_add <- decompose(ts_df, type="additive") #加算
decomposed_mlt <- decompose(ts_df, type="multiplicative") #乗算

# プロット作成
p1 <- decomposed_add |>
  autoplot() +
  labs(title = "足し算で分解")

p2 <- decomposed_mlt |>
  autoplot() +
  labs(title = "掛け算で分解")

p1 + p2

3.3.1
library(conflicted)
library(tidyverse)
library(GGally)
library(palmerpenguins)

penguins |>
  ggpairs(
    columns = c("bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"),
    columnLabels = c("くちばしの長さ [mm]", "くちばしの厚さ [mm]", "ひれの長さ [mm]", "体重 [g]"),
    aes(color = species),
    diag = list(continuous = wrap("densityDiag", alpha = .5))
    )

3.3.2
library(conflicted)
library(tidyverse)
library(palmerpenguins)

penguins |>
  ggplot(aes(x = bill_length_mm, y = body_mass_g, color = species, fill = species)) +
  geom_point(alpha=0.3) +
  geom_smooth(method = "lm") +
  labs(x = "くちばしの長さ [mm]", y = "体重 [g]") +
  theme(aspect.ratio = 1)

3.3.3
library(conflicted)
library(tidyverse)
library(palmerpenguins)
library(rsample)
library(MLmetrics)

# データの準備
pen <- penguins |>
  dplyr::select(body_mass_g, bill_length_mm, bill_depth_mm, flipper_length_mm, species) |>
  drop_na()

pen_list <- split(pen, pen$species)

# 分割
pen_split <- pen_list |>
  purrr::map(\(df) initial_split(df, prop = 0.8, strata = "body_mass_g"))
  
pen_train <-  pen_split |>
  purrr::map(\(df) training(df))

pen_test <-  pen_split |>
  purrr::map(\(df) testing(df))

# 学習
pen_lm <- pen_train |>
  purrr::map(\(df) lm(body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm, data = df))

# 検証
pen_pred <- pen_lm |>
  purrr::map2(pen_test, \(x, y) predict(x, newdata = y))

# 評価(RMSE)
pen_true <- pen_test |>
  purrr::map(pluck("body_mass_g"))
pen_pred |>
  purrr::map2_dbl(pen_true, \(y_pred, y_true) RMSE(y_pred, y_true))
##    Adelie Chinstrap    Gentoo 
##  342.7467  267.0557  338.0703
# Adelie Chinstrap    Gentoo 
# 336.5512  305.2450  313.6054 

# 散布図の作成
data.frame(
  y_true = pen_true |> unlist(),
  y_pred = pen_pred |> unlist()
  ) |>
  rownames_to_column(var = "species") |>
  mutate(species = str_remove(species, "[0-9]+")) |>
  ggplot(aes(x = y_true, y = y_pred, color = species)) +
    geom_point(alpha = 0.6) +
    labs(x = "実際の体重 [g]", y = "モデルによる予測 [g]", title = "線形回帰による予測") +
    geom_abline(intercept = 0, slope = 1) +
  coord_cartesian(xlim = c(3000, 6000), ylim = c(3000, 6000)) +
  theme(aspect.ratio = 1)

3.3.4
library(conflicted)
library(tidyverse)
library(patchwork)

# SNSデータの読み込み
df_sns <- read_csv("https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/sns.csv")

# 体重データを生成
df_weights <- tibble(
  Weight1 = rnorm(50, mean = 60, sd = sqrt(2)), # 平均60、分散2の正規分布,
  Weight2 = Weight1 + rnorm(50, mean = -1, sd = sqrt(0.5)) # 平均-1、分散0.5の正規分布
)

# ggplotを用いたグラフ描画
p1 <- df_weights |>
  ggplot(aes(x = Weight1, y = Weight2)) + 
  geom_point(color = "orange") +
  geom_abline(intercept = 0, slope = 1, color = "black") +
  labs(
    title = "同じ量の2時点の比較",
    x = "実験開始時体重 [kg]",
    y = "一か月後体重 [kg]") +
  coord_cartesian(xlim = c(56, 64), ylim = c(56, 64)) +
  theme(aspect.ratio = 1)

p2 <- df_sns |>
  ggplot(aes(x = Sent, y = Received)) + 
  geom_point(color = "lightblue") +
  geom_abline(intercept = 0, slope = 1, color = "black") +
  labs(
    title = "ペア間の双方向の量の比較",
    x = "送信したリプライ数",
    y = "受信したリプライ数"
    ) +
  coord_cartesian(xlim = c(0, 31), ylim = c(0, 31)) +
  theme(aspect.ratio = 1)

p1 + p2

3.3.5
library(conflicted)
library(tidyverse)
library(patchwork)

set.seed(0)

data <- bind_rows(
  tibble(# 相関の少ないデータの生成
    x = rnorm(100, 25, 10), # 平均25、分散10の正規分布
    y = rnorm(100, 75, 10)  # 平均75、分散10の正規分布
    ),
  tibble(# 相関の多いデータの生成
    x = rnorm(500, 50, 30), # 平均50、分散30の正規分布
    y = 0.5 * x + rnorm(500, 0, 10) # y = 0.5x + ノイズ
    ),
  tibble(
     x = rnorm(400, 25, 20), # 平均25、分散20の正規分布
     y = rnorm(400, 25, 30)  # 平均25、分散30の正規分布
     )
  )

# グラフ生成
p1 <- data |>
  ggplot(aes(x =x, y = y)) +
  geom_point() +
  coord_fixed() +
  labs(title = "不透明マーカー利用")

# 中央のサブプロットに統合データの半透明なマーカー散布図をプロット
p2 <- data |>
  ggplot(aes(x =x, y = y)) +
  geom_point(alpha = 0.1) +
  coord_fixed() +
  labs(title = "半透明マーカー利用")

# ヒートマップを生成
jet.colors <- colorRampPalette(
  c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red",
    "#7F0000"))

p3 <- data |>
  ggplot(aes(x = x, y = y)) +
  stat_density2d(aes(fill = after_stat(density)), geom="tile", contour=FALSE, n = 20) +
  scale_fill_gradientn(colours = jet.colors(10)) +
  coord_fixed() +
  labs(title = "ヒートマップ")

p1 + p2 +p3

3.3.6

library(conflicted)
library(tidyverse)
library(ggrepel)
library(janitor)

# データの読み込み
df <- read_csv(
  "https://raw.githubusercontent.com/tkEzaki/data_visualization/main/3%E7%AB%A0/data/bubble_chart_data.csv"
  ) |>
  clean_names() |>
  dplyr::filter(x2021_yr2021 != "..") |>
  mutate(x2021_yr2021 = as.numeric(x2021_yr2021))

region_df <- read_csv(
  "https://raw.githubusercontent.com/tkEzaki/data_visualization/main/3%E7%AB%A0/data/bubble_chart_region_data.csv"
  ) |>
  clean_names() 

# データの前処理
df_split <- df |>
  split(df$series_name)

names(df_split) <- c("gdp", "life_exp", "population")

# GDP, Life expectancy, Populationのデータをマージ
data <- df_split$gdp |>
  dplyr::select(country_name, country_code, x2021_yr2021) |>
  left_join(
    df_split$life_exp |>
      dplyr::select(country_name, x2021_yr2021),
    by = join_by(country_name),
    suffix = c("_GDP", "_Life_Expectancy")
    ) |>
  left_join(
    df_split$population |>
      dplyr::select(country_name, x2021_yr2021),
    by = join_by(country_name)
    ) |>
  rename(
    x2021_yr2021_Population = x2021_yr2021
  ) |>
  left_join( #regionをCountry Codeでjoin
    region_df |>
      dplyr::select(alpha_3, region),
    by = join_by(country_code == alpha_3)
    ) |>
  left_join( #regionをCountry_Nameでjoin
    region_df |>
      dplyr::select(name, region),
    by = join_by(country_name == name)
  ) |>
  mutate( #region確定
    region = if_else(is.na(region.x), region.y, region.x),
    .keep = "unused"
    ) |>
  drop_na() |> #region無い行を削除
  mutate(# GDP per capita(千ドル)を計算
    GDP_per_capita = x2021_yr2021_GDP / x2021_yr2021_Population / 1000,
    x2021_yr2021_Population = x2021_yr2021_Population / 100000
  )

# 描画
data |>
  ggplot(aes(
    x = GDP_per_capita,
    y = x2021_yr2021_Life_Expectancy,
    color = region,
    size = x2021_yr2021_Population,
    label = country_name
    )) +
  geom_point(alpha = 0.6) +
  scale_size_area(max_size = 11) + #値と面積が比例するように
  geom_text_repel(size = 3) +
  scale_x_log10() +
  labs(
    title = "色や大きさで情報を付加する",
    x = "一人あたりGDP [1,000$](対数軸)",
    y = "平均寿命 [年]", 
    size = "人口(10万人)", 
    color = "地域"
    ) +
  theme(aspect.ratio = 1)

第3章はここまで。